{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Action3  使用Gensim中的Word2Vec对三国演义进行Word  Embedding，分析和曹操最相近的词有哪些，曹操+刘备-张飞=?     数据集：three_kingdoms.txt      \n",
    "\n",
    "1、完成代码，结果正确（20points）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*-coding: utf-8 -*-\n",
    "# 先运行 word_seg进行中文分词，然后再进行word_similarity计算\n",
    "# 将Word转换成Vec，然后计算相似度 \n",
    "from gensim.models import word2vec\n",
    "import multiprocessing\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# 如果目录中有多个文件，可以使用PathLineSentences\n",
    "segment_folder = './three_kingdoms/segment'\n",
    "sentences = word2vec.PathLineSentences(segment_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置模型参数，进行训练\n",
    "model = word2vec.Word2Vec(sentences, size=100, window=3, min_count=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9836955"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.wv.similarity('曹操', '刘备')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('主公', 0.9968603253364563),\n",
       " ('丞相', 0.9960360527038574),\n",
       " ('商议', 0.9959854483604431),\n",
       " ('臣', 0.9948604702949524),\n",
       " ('歆', 0.9945951104164124),\n",
       " ('既', 0.9944403171539307),\n",
       " ('朕', 0.9942382574081421),\n",
       " ('此人', 0.994074285030365),\n",
       " ('卿', 0.993657648563385),\n",
       " ('瑾', 0.9935004115104675)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置模型参数，进行训练\n",
    "model2 = word2vec.Word2Vec(sentences, size=128, window=5, min_count=5, workers=multiprocessing.cpu_count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词向量的长度为128\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([ 6.08580768e-01, -2.49197036e-02,  1.64827511e-01,  8.37357622e-03,\n",
       "       -4.47976410e-01,  2.02559203e-01, -1.87098935e-01,  1.72011003e-01,\n",
       "        1.51644602e-01,  3.97912052e-04,  2.26882160e-01,  1.65919602e-01,\n",
       "        2.34445840e-01, -3.99728298e-01,  1.19516209e-01, -4.46973234e-01,\n",
       "        1.57894358e-01, -1.93101361e-01, -9.80055779e-02,  1.41757071e-01,\n",
       "        2.65119404e-01, -2.71634072e-01,  3.58081639e-01, -4.31689844e-02,\n",
       "       -1.62348121e-01, -2.46229216e-01, -1.31256595e-01,  2.13635355e-01,\n",
       "        1.50580029e-03, -3.14851761e-01,  5.11017255e-02, -1.37009680e-01,\n",
       "       -2.10380033e-01, -2.07180023e-01, -1.46164037e-02, -1.77646860e-01,\n",
       "        5.71174398e-02, -2.53167957e-01,  4.49619219e-02,  2.42641807e-01,\n",
       "        2.20180467e-01,  2.45785981e-01, -1.35408700e-01,  4.80356038e-01,\n",
       "       -2.85946988e-02, -2.53566474e-01,  4.86007333e-01, -2.58445263e-01,\n",
       "       -6.76099807e-02, -1.40994608e-01,  1.16484903e-01,  2.85751019e-02,\n",
       "        1.99429929e-01,  1.22567482e-01, -5.78944385e-01, -5.20291515e-02,\n",
       "       -3.76395941e-01, -2.25920975e-01,  5.49109280e-01,  2.36247808e-01,\n",
       "        1.81829438e-01, -1.88141406e-01, -7.02612638e-01,  2.55148083e-01,\n",
       "        2.20958190e-03, -2.96094567e-01, -7.70204440e-02, -3.26515198e-01,\n",
       "        3.97486508e-01,  4.86003578e-01,  7.15127766e-01,  1.04058824e-01,\n",
       "        1.99798480e-01,  3.85455728e-01, -3.05614442e-01, -8.49079669e-01,\n",
       "        3.79086845e-02,  3.42608690e-02, -3.38510066e-01,  8.85155872e-02,\n",
       "       -1.04646228e-01, -2.64541656e-01, -9.37452838e-02,  2.86226243e-01,\n",
       "        2.50198543e-01, -3.38069379e-01,  3.52674186e-01,  1.92069143e-01,\n",
       "        1.91921175e-01, -3.64188135e-01, -9.98495594e-02,  2.22107798e-01,\n",
       "       -6.65148348e-02,  5.57873726e-01,  4.56209302e-01,  3.38459492e-01,\n",
       "        2.72135794e-01,  5.13357878e-01,  1.17525935e-01, -5.48588336e-01,\n",
       "        3.30736428e-01,  2.26907969e-01,  1.89264491e-01, -4.00057852e-01,\n",
       "        1.34293139e-01,  1.65952697e-01,  7.13034943e-02,  1.54641755e-02,\n",
       "       -2.98919946e-01, -2.54280269e-01,  2.82494247e-01, -2.41627321e-01,\n",
       "        4.24832046e-01,  1.28637508e-01,  5.71718574e-01,  1.37418345e-01,\n",
       "       -1.66412905e-01,  2.94145226e-01, -3.52390081e-01, -1.40884414e-01,\n",
       "       -2.94567317e-01, -5.44494510e-01, -8.88940617e-02,  9.34066400e-02,\n",
       "        1.45189017e-01,  3.14915538e-01, -6.08124323e-02,  8.76332164e-01],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 每一个词都对应一个长度为128的向量\n",
    "embedding_dim = model2['曹操'].shape[0]\n",
    "print('词向量的长度为{}'.format(embedding_dim))\n",
    "model2['曹操']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8417533"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model2.similarity('曹操','刘备')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.84175336"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "# dot（'曹操'/|'曹操'|， '刘备'/|'刘备'| ）\n",
    "np.dot(model2['曹操']/np.linalg.norm(model2['曹操']), model2['刘备']/np.linalg.norm(model2['刘备']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('丞相', 0.9926790595054626),\n",
       " ('吾', 0.9890015721321106),\n",
       " ('此', 0.9882171750068665),\n",
       " ('何人', 0.9875189065933228),\n",
       " ('愿即', 0.9865968823432922),\n",
       " ('臣', 0.9849131107330322),\n",
       " ('朕', 0.9849110841751099),\n",
       " ('今', 0.9840759038925171),\n",
       " ('何故', 0.9835803508758545),\n",
       " ('叹', 0.9826781749725342)]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model2.most_similar(positive=['曹操', '刘备'], negative=['张飞'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('孙权', 0.981737494468689),\n",
       " ('袁绍', 0.9794854521751404),\n",
       " ('关将军', 0.9786468744277954),\n",
       " ('往', 0.9785541296005249),\n",
       " ('众将', 0.978034257888794),\n",
       " ('不悦', 0.9755598306655884),\n",
       " ('老人', 0.9754760265350342),\n",
       " ('周瑜', 0.9751571416854858),\n",
       " ('夫人', 0.9748931527137756),\n",
       " ('作歌', 0.9748011231422424)]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 找出最相近的词，余弦相似度\n",
    "model2.most_similar(positive=['曹操'], topn=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在 曹操 孙权 罗贯中 关公 获胜 中:\n",
      "不是同一类别的词为: 获胜\n"
     ]
    }
   ],
   "source": [
    "# 找出不同的词\n",
    "test_words = '曹操 孙权 罗贯中 关公 获胜'\n",
    "test_words_result = model2.doesnt_match(test_words.split())\n",
    "print('在 '+test_words+' 中:\\n不是同一类别的词为: %s' %test_words_result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.0 64-bit ('Bi_env': venv)",
   "language": "python",
   "name": "python38064bitbienvvenvba07af95a1bb4b078aa8134bba84dff2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
